Render this script with > sbatch ~/spinal_cord_paper/scripts/Gg_all_int_render.sh

library(Seurat)
library(magrittr)
library(dplyr)
library(Rtsne)
library(RColorBrewer)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(patchwork)
library(cowplot)
library(ggdendro)

Integration

the following workflow is based on:

my.samples <- list()

my.samples[[1]] <- readRDS("~/spinal_cord_paper/data/Gg_ctrl_1_seurat_070323.rds")
my.samples[[2]] <- readRDS("~/spinal_cord_paper/data/Gg_ctrl_2_seurat_070323.rds")
my.samples[[3]] <- readRDS("~/spinal_cord_paper/data/Gg_lumb_1_seurat_070323.rds")
my.samples[[4]] <- readRDS("~/spinal_cord_paper/data/Gg_lumb_2_seurat_070323.rds")
my.samples[[5]] <- readRDS("~/spinal_cord_paper/data/Gg_poly_1_seurat_070323.rds")
my.samples[[6]] <- readRDS("~/spinal_cord_paper/data/Gg_poly_2_seurat_070323.rds")
my.samples[[7]] <- readRDS("~/spinal_cord_paper/data/Gg_D05_ctrl_seurat_070323.rds")
my.samples[[8]] <- readRDS("~/spinal_cord_paper/data/Gg_D07_ctrl_seurat_070323.rds")

names(my.samples) <- c("Gg_ctrl_1", "Gg_ctrl_1", "Gg_lumb_1", "Gg_lumb_2", "Gg_poly_1", "Gg_poly_2", "Gg_ctrl_D05", "Gg_ctrl_D07")
# integration features
features <- SelectIntegrationFeatures(
    object.list = my.samples,
    nfeatures = 3000
    )

# prepare integration 
my.samples <- PrepSCTIntegration(
    object.list = my.samples,
    anchor.features = features
    )

# find anchors
NT.anchors <- FindIntegrationAnchors(
    object.list = my.samples,
    normalization.method = "SCT",
    anchor.features = features
    )

rm(my.samples)

# integrate
my.int <- IntegrateData(
    anchorset = NT.anchors,
    normalization.method = "SCT"
    )

# compute PCA
my.int <- RunPCA(
    my.int,
    verbose = FALSE
    )

# Run tSNE
my.int <- RunTSNE(
    my.int,
    reduction = "pca",
    dims = 1:30
    )
saveRDS(my.int, file = paste0("~/spinal_cord_paper/data/Gg_all_seurat_",format(Sys.Date(), "%d%m%y"),".rds"))

Integrated Seurat

Load the data

my.int <- readRDS("~/spinal_cord_paper/data/Gg_all_seurat_220524.rds")
my.int@project.name <- "Gg_all_int"

Inspect the data

PCAPlot(my.int, group.by = "orig.ident")

DimPlot(my.int, reduction = "tsne", group.by = "orig.ident")

DimPlot(my.int, reduction = "tsne", split.by = "orig.ident", ncol = 4)

Violin Plot by Sample

Use Counts (UMI) and Features (Genes) from the raw data (RNA assay).

meta_int <- my.int@meta.data %>% 
  mutate(orig.ident = factor(
    orig.ident,
    levels = c("Gg_D05_ctrl", "Gg_D07_ctrl",
               "Gg_ctrl_1", "Gg_ctrl_2",
               "Gg_lumb_1", "Gg_lumb_2",
               "Gg_poly_1", "Gg_poly_2")))

vln_pm <- ggplot(meta_int, aes(x = orig.ident, y = percent.mt, fill = orig.ident)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  theme_cowplot() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45,
                                   vjust = 1,
                                   hjust = 1)
        ) 
      
vln_nC <- ggplot(meta_int, aes(x = orig.ident, y = nCount_RNA, fill = orig.ident)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  theme_cowplot() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45,
                                   vjust = 1,
                                   hjust = 1)
        )
        
vln_cF <- ggplot(meta_int, aes(x = orig.ident, y = nFeature_RNA, fill = orig.ident)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  theme_cowplot() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45,
                                   vjust = 1,
                                   hjust = 1)
        )

vln_pm + vln_nC + vln_cF

pdf("~/spinal_cord_paper/figures/Supp_Fig_1_Gg_all_vln.pdf", width = 15, height = 8)
vln_pm + vln_nC + vln_cF

Cell cycle scoring

# Load the orhtology table
ortho_gg_mm_v102 <- readRDS("~/spinal_cord_paper/data/ortho_gg_mm_v102.rds") 

colnames(ortho_gg_mm_v102) <- c(
  "GG_gene_ID",
  "GG_gene_Name",
  "MM_gene_ID",
  "MM_gene_Name",
  "ortho_conf",
  "homolog_type"
)

# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.  We can
# segregate this list into markers of G2/M phase and markers of S phase
s.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$s.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$s.genes)) %>%
  dplyr::pull(GG_gene_ID)

g2m.genes <- ortho_gg_mm_v102 %>%
  dplyr::mutate(MM_gene_Name = toupper(MM_gene_Name)) %>%
  dplyr::filter(MM_gene_Name %in% cc.genes$g2m.genes) %>%
  dplyr::arrange(match(MM_gene_Name, cc.genes$g2m.genes)) %>%
  dplyr::pull(GG_gene_ID) 

t0 <- Sys.time()
my.int <- CellCycleScoring(
  my.int,
  s.features = s.genes,
  g2m.features = g2m.genes,
  set.ident = TRUE
  )
paste0("Seurats CC scoring took ", Sys.time() - t0, " seconds to run.")

my.int$CC.Difference.seurat <- my.int$S.Score - my.int$G2M.Score
# view cell cycle scores and phase assignments
head(my.int[[]])

rm(t0, s.genes, g2m.genes)

PCA plot of non-regressed data

PCAPlot(my.int, group.by = "Phase")

rescale integrated assay

We rescale the data to regress out the CC.difference

all_genes <- rownames(my.int)

my.int <- ScaleData(
  my.int,
  features = all_genes,
  vars.to.regress = "CC.Difference.seurat"
)
## Regressing out CC.Difference.seurat
## Centering and scaling data matrix

PCA

# Run the actual PCA (by default uses var.features of assay)
my.int <- RunPCA(my.int, verbose = F)

PCAPlot(my.int, group.by = "Phase")

# Which dimensions will we choose?
hist(my.int@reductions$pca@stdev^2, breaks = 500)

my.dimensions=1:20

Dim Reduction

We will run the tSNE. Using the FFT-accelerated Interpolation-based t-SNE (FIt-SNE). We run two, a normal tSNE, and an exaggerated tSNE, where clusters are tighter togheter. This is located under the xtsne name.

# Get the tSNE function
source("~/Software/FIt-SNE-master/fast_tsne.R")

my.tsne = fftRtsne(my.int@reductions$pca@cell.embeddings[,my.dimensions],
                   max_iter = 1500,
                   learning_rate =  round(dim(my.int)[2]/12),
                   initialization =(my.int@reductions$pca@cell.embeddings[,1:2]/my.int@reductions$pca@stdev[1])*0.0001,
                   perplexity_list = c(25, round(dim(my.int)[2]/100)),
                   fast_tsne_path="~/Software/FIt-SNE-master/bin/fast_tsne")

colnames(my.tsne) = c("tsne_1", "tsne_2")
rownames(my.tsne) = colnames(my.int)

my.int[["tsne"]] = CreateDimReducObject(embeddings = my.tsne, key = "tsne_", assay = DefaultAssay(my.int), global = T)

Clustering

Here we perform the Louvain-jaccard clustering implemented in Seurat. We can see the tree of clusters, to see how the clusters relate in the PCA space.
We also use the tree, to check if any of the terminal pairs of sisters should be merged. This is determined based on a minimum of 20 DEs between the clusters.

# Find first the nearest neighbors
my.int <- FindNeighbors(object = my.int, dims=my.dimensions, verbose = F)

# Then the actual clusters
my.int <- FindClusters(object = my.int, resolution = 1, verbose = F, random.seed = 42)

# Check the tree of clusters, to see what's the relationship between them
my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
plot(Tool(object = my.int, slot = 'BuildClusterTree'))

# We are gonna check for DEs using the non integrated data. We are only gonna test genes that have variability, so we calculate variable genes
my.int <- FindVariableFeatures(my.int, assay = "RNA")


# Only the genes with variability > median
my.HVF <- HVFInfo(my.int, assay = "RNA")
my.HVF <- rownames(my.HVF)[which(my.HVF[,3] > (median(my.HVF[,3])))]
# We check for pairs of clusters, how mayn DEs they have. If less than n, we merge them
keep.check <- T
while (keep.check == T) {
  
  keep.check <- F
  # Check the tree of clusters, to see what's the relationship between them
  my.int <- BuildClusterTree(my.int, dims = my.dimensions, verbose = F)
  # Check only the terminal sisters
  to.check = ips::terminalSisters(my.int@tools$BuildClusterTree)
  for (i in to.check) {
    # DE between the sisters
    my.DE <- FindMarkers(my.int, i[1], i[2], test.use = "MAST", latent.vars = c("CC.Difference.seurat"),
                         min.pct = 0.25, verbose = T, assay = "RNA", features = my.HVF,) %>%
      dplyr::filter(abs(avg_log2FC) > 0.5) %>%
      dplyr::filter(p_val_adj < 0.05)
    
    # If less than 5, merge, and repeat
    if (dim(my.DE)[1] < 5) {
      cat(paste0(dim(my.DE)[1], " genes differentially expressed between clusters ",i[1]," and ",i[2]," merging \n"))
      my.int <- SetIdent(my.int, cells = WhichCells(my.int, idents = i[2]), value = i[1])
      keep.check <- T
    }
    print(i)
  }
}
rm(to.check, my.HVF, my.DE)
# renumber starting from 1
my.ID <- factor(
  Idents(my.int),
  levels= levels(Idents(my.int))[base::order(as.numeric(levels(Idents(my.int))))])
levels(my.ID) <- seq(length(levels(my.ID)))
Idents(my.int) <- my.ID
my.int[["seurat_clusters"]] <- my.ID

# Check again the clusters
dim1 <- DimPlot(my.int, reduction = "tsne", cols = rainbow(length(levels(my.ID))), label = T, label.size = 5, pt.size = 1)
dim1

Now that we have the final reductions, we’ll choose one and we look at the statistics of the cells.

modplots::mFeaturePlot(my.int, 
                       my.features = c("OLIG2", "SOX9", "TUBB3", "SHH",
                                       "SLC18A3", "GATA2", "PAX2", "TLX3",
                                       "IGFBP7", "HBBA", "IFI30", "MSX2", "PLP1"),
                       gnames = modplots::gnames)
## 

# set and get dim.reduct embeddings
my.reduc <- "tsne"
emb <- data.frame(Embeddings(my.int, my.reduc))
colnames(emb) <- c("reduc_1", "reduc_2")

meta <- my.int[[]] %>%
  tibble::rownames_to_column("cell_ID")


my.plots = list()

my.plots[[1]] = ggplot(emb[meta[order(meta$nCount_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="UMI count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[2]] = ggplot(emb[meta[order(meta$nFeature_RNA),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="gene count", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[3]] = ggplot(emb[meta[order(meta$nCount_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nCount_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="UMI count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[4]] = ggplot(emb[meta[order(meta$nFeature_SCT),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$nFeature_RNA)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="gene count (SCT)", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[5]] = ggplot(emb[meta[order(meta$percent.mt),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=(sort(meta$percent.mt))), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="log1p MT percent", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[6]] = ggplot(emb[meta[order(meta$CC.Difference.seurat),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$CC.Difference.seurat)), size=1, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="Cell Cycle\nS-G2M", x = paste0(my.reduc, "_1"), y = paste0(my.reduc, "_2"))

my.plots[[7]] = ggplot(emb[meta[order(meta$percent.rb),]$cell_ID, ], aes(x = reduc_1, y = reduc_2)) +
    geom_point(aes(color=sort(meta$percent.rb)), size=2, alpha = 0.4, pch = 19) + 
    scale_colour_gradientn(colours = c("gray90","gray90","gray80","yellow", "orange", "red", "darkred", "darkred")) +
    theme_classic() + labs(colour="percent.rb")

grid.arrange(grobs=my.plots, ncol=2)

DE analysis

Here we do the differential expression analysis, and end up with the marker genes lists. We can also see the marker gene dot plot, for the top 2 marker genes per cluster

# Find all the marker genes, with these thresholds MAST
semarkers = FindAllMarkers(my.int,
                           features = my.int[["integrated"]]@var.features, 
                           only.pos = TRUE, 
                           min.pct = 0.25, 
                           logfc.threshold =  0.5,
                           latent.vars = c("CC.Difference.seurat"), 
                           test.use = "MAST", 
                           assay = "RNA", 
                           return.thresh = 0.05)

# We only keep the significant ones
semarkers <- semarkers %>%
  filter(p_val_adj < 0.05) %>% 
  rename(Gene.stable.ID = gene) %>% 
  left_join(modplots::gnames, by = "Gene.stable.ID")


# Take only the top 10
semrk10 = semarkers %>% group_by(cluster) %>% top_n(-10, p_val_adj)
semrk1 = semarkers %>% group_by(cluster) %>% top_n(-1, p_val_adj)

modplots::mDotPlot2(my.int, 
          features = unique(semrk1$Gene.stable.ID), 
          cols = c("grey", "black"),  
          gnames = modplots::gnames, dot.scale = 6) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

save

saveRDS(
  my.int,
  paste0(
    "~/spinal_cord_paper/data/",
    my.int@project.name,
    "_seurat_",
    format(Sys.Date(), "%d%m%y"),
    ".rds"
  )
)

write.table(
  semarkers,
  sep = "\t",
  row.names = T,
  col.names = T,
  file = paste0(
    "~/spinal_cord_paper/data/",
    my.int@project.name,
    "_fullDE_",
    format(Sys.Date(), "%d%m%y"),
    ".txt"
  ),
  quote = F
)

write.table(
  semrk10,
  sep = "\t",
  row.names = T,
  col.names = T,
  file = paste0(
    "~/spinal_cord_paper/data/",
    my.int@project.name,
    "_top10DE_",
    format(Sys.Date(), "%d%m%y"),
    ".txt"
  ),
  quote = F
)
# Date and time of Rendering
Sys.time()
## [1] "2024-05-27 19:06:33 CEST"
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /scicore/soft/apps/OpenBLAS/0.3.1-GCC-7.3.0-2.30/lib/libopenblas_sandybridgep-r0.3.1.so
## 
## locale:
## [1] en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggdendro_0.1.22    cowplot_1.1.1      patchwork_1.1.1    gridExtra_2.3     
##  [5] ggplot2_3.3.3      tidyr_1.1.3        RColorBrewer_1.1-2 Rtsne_0.15        
##  [9] dplyr_1.0.10       magrittr_2.0.1     SeuratObject_4.0.2 Seurat_4.0.5      
## 
## loaded via a namespace (and not attached):
##   [1] fastmatch_1.1-0             plyr_1.8.6                 
##   [3] igraph_1.2.6                lazyeval_0.2.2             
##   [5] sp_1.4-5                    splines_4.1.0              
##   [7] listenv_0.8.0               scattermore_0.7            
##   [9] GenomeInfoDb_1.28.0         digest_0.6.27              
##  [11] htmltools_0.5.1.1           ips_0.0.11                 
##  [13] fansi_0.5.0                 memoise_2.0.0              
##  [15] tensor_1.5                  cluster_2.1.2              
##  [17] ROCR_1.0-11                 Biostrings_2.60.0          
##  [19] globals_0.16.2              matrixStats_0.58.0         
##  [21] modplots_1.0.0              spatstat.sparse_3.0-0      
##  [23] prettyunits_1.1.1           colorspace_2.0-1           
##  [25] blob_1.2.1                  ggrepel_0.9.1              
##  [27] xfun_0.34                   crayon_1.4.1               
##  [29] RCurl_1.98-1.3              jsonlite_1.7.2             
##  [31] spatstat.data_3.0-0         survival_3.2-11            
##  [33] zoo_1.8-9                   phangorn_2.7.0             
##  [35] ape_5.5                     glue_1.6.2                 
##  [37] polyclip_1.10-0             gtable_0.3.0               
##  [39] zlibbioc_1.38.0             XVector_0.32.0             
##  [41] leiden_0.3.9                DelayedArray_0.18.0        
##  [43] future.apply_1.7.0          SingleCellExperiment_1.14.1
##  [45] BiocGenerics_0.38.0         abind_1.4-5                
##  [47] scales_1.1.1                pheatmap_1.0.12            
##  [49] DBI_1.1.1                   miniUI_0.1.1.1             
##  [51] Rcpp_1.0.7                  progress_1.2.2             
##  [53] viridisLite_0.4.0           xtable_1.8-4               
##  [55] reticulate_1.22             spatstat.core_2.1-2        
##  [57] bit_4.0.4                   stats4_4.1.0               
##  [59] htmlwidgets_1.5.3           httr_1.4.2                 
##  [61] ellipsis_0.3.2              ica_1.0-2                  
##  [63] pkgconfig_2.0.3             XML_3.99-0.6               
##  [65] farver_2.1.0                sass_0.4.0                 
##  [67] uwot_0.1.10                 deldir_1.0-6               
##  [69] utf8_1.2.1                  AnnotationDbi_1.54.0       
##  [71] tidyselect_1.2.0            labeling_0.4.2             
##  [73] rlang_1.0.6                 reshape2_1.4.4             
##  [75] later_1.2.0                 cachem_1.0.5               
##  [77] munsell_0.5.0               tools_4.1.0                
##  [79] cli_3.4.1                   RSQLite_2.2.7              
##  [81] generics_0.1.3              ggridges_0.5.3             
##  [83] org.Gg.eg.db_3.13.0         evaluate_0.20              
##  [85] stringr_1.4.0               fastmap_1.1.0              
##  [87] yaml_2.2.1                  goftest_1.2-2              
##  [89] bit64_4.0.5                 knitr_1.41                 
##  [91] fitdistrplus_1.1-6          purrr_0.3.4                
##  [93] RANN_2.6.1                  KEGGREST_1.32.0            
##  [95] pbapply_1.4-3               future_1.30.0              
##  [97] nlme_3.1-152                mime_0.10                  
##  [99] compiler_4.1.0              plotly_4.10.0              
## [101] png_0.1-7                   spatstat.utils_3.0-1       
## [103] tibble_3.1.8                bslib_0.2.5.1              
## [105] stringi_1.6.2               highr_0.9                  
## [107] lattice_0.20-44             Matrix_1.3-3               
## [109] vctrs_0.5.1                 pillar_1.8.1               
## [111] lifecycle_1.0.3             spatstat.geom_3.0-3        
## [113] lmtest_0.9-38               jquerylib_0.1.4            
## [115] RcppAnnoy_0.0.19            data.table_1.14.0          
## [117] bitops_1.0-7                irlba_2.3.3                
## [119] GenomicRanges_1.44.0        httpuv_1.6.1               
## [121] R6_2.5.0                    promises_1.2.0.1           
## [123] KernSmooth_2.23-20          IRanges_2.26.0             
## [125] parallelly_1.33.0           codetools_0.2-18           
## [127] MASS_7.3-54                 assertthat_0.2.1           
## [129] SummarizedExperiment_1.22.0 MAST_1.18.0                
## [131] withr_2.4.2                 sctransform_0.3.3          
## [133] GenomeInfoDbData_1.2.6      S4Vectors_0.30.0           
## [135] hms_1.1.0                   mgcv_1.8-35                
## [137] parallel_4.1.0              quadprog_1.5-8             
## [139] grid_4.1.0                  rpart_4.1-15               
## [141] rmarkdown_2.17              MatrixGenerics_1.4.0       
## [143] Biobase_2.52.0              shiny_1.6.0